### Alternative Measures of AQI

rm(list = ls())

### libraries and packages
pkg <- c("plyr", "dplyr", "tidyr", 
         "MASS", "multiwayvcov", 
         "stargazer", "lmtest", "doBy", "ggplot2", 
         "mediation", "vars", "DataCombine", "foreign",
         "mgcv", "ordinal", "reshape2", 
         "xtable", "sandwich", "rms")
lapply(pkg, require, character.only = TRUE)

# rid of E
options(scipen=999)
### setting wd for inputs
setwd("~/Dropbox/Pollution and Public Perceptions in China/Data and Analysis/Data")

# loading the data
load("ppp_cleaned")


#################################################
###### NOTE THE FOLLOWING CODE THAT MERGES IN US EMBASSY DATA FOR PM2.5 AND UNOFFICIAL DATA FOR AQI ######
#################################################

#################################################
###### MERGING IN US EMBASSY DATA FOR PM2.5 ######
#################################################

# cleaning the date in p2, making it an actual date variable
p2$begin_date <- as.Date(p2$begin_date, format = "%Y-%m-%d")
p2$begin_date

# loading PM (saved as a dta file from Embassy_Analysis.R)
setwd("~/Dropbox/Pollution and Public Perceptions in China/Data and Analysis/Data/PM2.5")
PM <- read.dta("PM.dta")

PM$begin_date <- as.Date(PM$begin_date, format = "%Y-%m-%d") # making PM's "begin_date" an actual date, not character
PM$begin_date

p2$educ <- as.numeric(as.character(p2$educ))
p2$income <- as.numeric(as.character(p2$income))
p2$age <- as.numeric(as.character(p2$age))
p2$educ_sd <- as.numeric(as.character(p2$educ_sd))
p2$income_sd <- as.numeric(as.character(p2$income_sd))
p2$age_sd <- as.numeric(as.character(p2$age_sd))

p3 <- left_join(p2, PM, c("begin_date" = "begin_date")) # merging PM with p2. The new variable for PM2.5 is called "value"

### creating lagged independent variables, value.lagged and value.lagged2
start <- 1 # setting up a "starting value" to facilitate the value generating scheme
span <- rep(NA, length(unique(p3$day))) # creating a span, assigning NAs to it
p3$value.lagged <- NA # creating the lagged aqi variable, initially assigning NAs to all of the entries
for (i in 1:length(unique(p2$day))) { # loop over every day at the aggregated level
  span.i <- nrow(p3[p3$day == i,]) + start - 1 # today
  span.i2 <- nrow(p3[p3$day == i+1,]) + span.i # tomorrow
  
  value.to.be.copied <- unique(p3$value[start:span.i]) # from the starting value to the end of a day
  p3$value.lagged[(span.i+1):span.i2] <- value.to.be.copied # from the starting value of each day to the next
  
  start <- span.i + 1 # updating the starting value
} # we ignore all the error messages 

# Creating a variable for AQIlagged2
start <- 1 # setting up a "starting value" to facilitate the value generating scheme
span <- rep(NA, length(unique(p3$day))) # creating a span, assigning NAs to it
p3$value.lagged2 <- NA # creating the lagged aqi variable, initially assigning NAs to all of the entries
for (i in 1:length(unique(p3$day))) { # loop over every day at the aggregated level
  span.i <- nrow(p3[p3$day == i,]) + start - 1 # today
  span.i2 <- nrow(p3[p3$day == i+1,]) + span.i # tomorrow
  
  value.to.be.copied <- unique(p3$value.lagged[start:span.i]) # from the starting value to the end of a day
  p3$value.lagged2[(span.i+1):span.i2] <- value.to.be.copied # from the starting value of each day to the next
  
  start <- span.i + 1 # updating the starting value
} # we ignore all the error messages 
p3$value.lagged2

# manually adding the lagged values for the first 2 days (PM 2.5)
p3$value.lagged[p3$day==1] <- 109
p3$value.lagged2[p3$day==1] <- 83
p3$value.lagged2[p3$day==2] <- 109

# standardize value to a value with mean 0 and standard deviation 1
p3$value <- (p3$value-mean(p3$value))/(sd(p3$value))
p3$value.lagged <- (p3$value.lagged-mean(p3$value.lagged))/(sd(p3$value.lagged))
p3$value.lagged2 <- (p3$value.lagged2-mean(p3$value.lagged2))/(sd(p3$value.lagged2))

### Running full specifications using new independent variables, value, value.lagged and value.lagged2
l_satc <- lm(l_sat
             ~ value + value.lagged + value.lagged2 + hukou + insider + soe + educ 
             + kids + married + income + gender + ccp + parade + national + age, data=p3)
m.vcov1 <-cluster.vcov(l_satc, p3$day)  
a <- coeftest(l_satc, m.vcov1) # significant

# value and c_sat
c_satc <- lm(c_sat
             ~ value + value.lagged + value.lagged2 + hukou + insider + soe + educ 
             + kids + married + income + gender + ccp + parade + national + age, data=p3)
m.vcov25 <-cluster.vcov(c_satc, p3$day)  
b <- coeftest(c_satc, m.vcov25) # p = .12

# value and the demand for more watchdog at the local level
l_watchc <- lm(l_watch
               ~ value + value.lagged + value.lagged2 + hukou + insider + soe + educ 
               + kids + married + income + gender + ccp + parade + national + age, data=p3)
m.vcov13 <-cluster.vcov(l_watchc, p3$day)  
c <- coeftest(l_watchc, m.vcov13) # significant

# value and the demand for more watchdog at the central level 
c_watchc <- lm(c_watch
               ~ value + value.lagged + value.lagged2 + hukou + insider + soe + educ 
               + kids + married + income + gender + ccp + parade + national + age, data=p3)
m.vcov14 <-cluster.vcov(c_watchc, p3$day)  
d <- coeftest(c_watchc, m.vcov14) # significant


# value and anti-west sentiment
anti_westc <- lm(anti_west
                 ~ value + value.lagged + value.lagged2 + hukou + insider + soe + educ 
                 + kids + married + income + gender + ccp + parade + national + age, data=p3)
m.vcov25 <-cluster.vcov(anti_westc, p3$day)  
f <- coeftest(anti_westc, m.vcov25) # significant p = .0510

# value and life stisfaction
life_satc <- lm(life_sat
                ~ value + value.lagged + value.lagged2 + hukou + insider + soe + educ 
                + kids + married + income + gender + ccp + parade + national + age, data=p3)
m.vcov25 <-cluster.vcov(life_satc, p3$day)  
g <- coeftest(life_satc, m.vcov25) # significant

#################################################
##### PM2.5 - CREATING A TABLE
#################################################

column.labels <- c("Local Government", "Central Government", "Local Oversight", "Central Oversight",  "Global Warming")
cov.labs <-c("PM 2.5", "PM 2.5 Lagged", "PM 2.5 Lagged 2","Hukou Status", "Regime Insider", "SOE Employee", "Education","Children","Married","Income", "Female", "CCP Member","Parade Period","Holiday","Age")
## TABLE A19
sink("~/Dropbox/Pollution and Public Perceptions in China/Manuscript/Tables/pm2.5.tex")
stargazer(l_satc, c_satc, l_watchc, c_watchc, anti_westc, font.size = "large", digits = 4, 
          column.labels = column.labels,
          covariate.labels = cov.labs,
          dep.var.labels.include = FALSE,
          model.numbers = FALSE,
          style = "qje", no.space = FALSE, keep.stat = c("n"),
          star.cutoffs = c(.10, .05, .01), 
          se = list(a[,2], b[,2], c[, 2], d[, 2], f[, 2]),
          header = F, notes = "robust standard errors clustered by day in parentheses", float = F)
sink()

#################################################
### MERGING IN DATA FOR UNOFFICIAL AQI ###
#################################################

setwd("~/Dropbox/Pollution and Public Perceptions in China/Data and Analysis/Data/")

# note that aqi_unofficial.csv was created by simply copying and pasting the tables from www.aqistudy.cn

###### loading the unofficial aqi data ######
unofficialaqi <- read.csv("aqi_unofficial.csv")
useful <- c("begin_date", "AQI") # subsetting out all the empty variables named X X1 X2 etc.
uaqi <- unofficialaqi[useful] # calling the new data uaqi

# fixing the date
uaqi$begin_date <- as.character(uaqi$begin_date) # first, making the date a character
uaqi$begin_date <- as.Date(uaqi$begin_date, format = "%Y/%m/%d") # then, making the character an actual date
uaqi$begin_date # checking to make sure that nothing went wrong

p4 <- left_join(p2, uaqi, c("begin_date" = "begin_date")) # merge uaqi with p2. The new variable is called "AQI"

### creating lagged independent variables, AQI.lagged and AQI.lagged2

start <- 1 # setting up a "starting AQI" to facilitate the AQI generating scheme
span <- rep(NA, length(unique(p4$day))) # creating a span, assigning NAs to it
p4$AQI.lagged <- NA # creating the lagged aqi variable, initially assigning NAs to all of the entries
for (i in 1:length(unique(p2$day))) { # loop over every day at the aggregated level
  span.i <- nrow(p4[p4$day == i,]) + start - 1 # today
  span.i2 <- nrow(p4[p4$day == i+1,]) + span.i # tomorrow
  
  AQI.to.be.copied <- unique(p4$AQI[start:span.i]) # from the starting AQI to the end of a day
  p4$AQI.lagged[(span.i+1):span.i2] <- AQI.to.be.copied # from the starting AQI of each day to the next
  
  start <- span.i + 1 # updating the starting AQI
} # we ignore all the error messages 

# Creating a variable for AQIlagged2
start <- 1 # setting up a "starting AQI" to facilitate the AQI generating scheme
span <- rep(NA, length(unique(p4$day))) # creating a span, assigning NAs to it
p4$AQI.lagged2 <- NA # creating the lagged aqi variable, initially assigning NAs to all of the entries
for (i in 1:length(unique(p4$day))) { # loop over every day at the aggregated level
  span.i <- nrow(p4[p4$day == i,]) + start - 1 # today
  span.i2 <- nrow(p4[p4$day == i+1,]) + span.i # tomorrow
  
  AQI.to.be.copied <- unique(p4$AQI.lagged[start:span.i]) # from the starting AQI to the end of a day
  p4$AQI.lagged2[(span.i+1):span.i2] <- AQI.to.be.copied # from the starting AQI of each day to the next
  
  start <- span.i + 1 # updating the starting AQI
} # we ignore all the error messages 
p4$AQI.lagged2

# manually adding lags 
p4$AQI.lagged[p4$day==1] <- 158
p4$AQI.lagged2[p4$day==1] <- 117
p4$AQI.lagged2[p4$day==2] <- 117

# standardize unofficial AQI with mean 0 and standard deviation 1
p4$AQI <- (p4$AQI-mean(p4$AQI))/(sd(p4$AQI))
p4$AQI.lagged <- (p4$AQI.lagged-mean(p4$AQI.lagged))/(sd(p4$AQI.lagged))
p4$AQI.lagged2 <- (p4$AQI.lagged2-mean(p4$AQI.lagged2))/(sd(p4$AQI.lagged2))

# running the full specification with AQI
# local government satisfaction
l_satc <- lm(l_sat
             ~ AQI + AQI.lagged + AQI.lagged2 + hukou + insider + soe + educ 
             + kids + married + income + gender + ccp + parade + national + age, data=p4)
m.vcov1 <-cluster.vcov(l_satc, p4$day)  
a <- coeftest(l_satc, m.vcov1) # significant

# AQI and c_sat
c_satc <- lm(c_sat
             ~ AQI + AQI.lagged + AQI.lagged2 + hukou + insider + soe + educ 
             + kids + married + income + gender + ccp + parade + national + age, data=p4)
m.vcov25 <-cluster.vcov(c_satc, p4$day)  
b <- coeftest(c_satc, m.vcov25) # not significant p = .107


# AQI and the demand for more watchdog at the local level
l_watchc <- lm(l_watch
               ~ AQI + AQI.lagged + AQI.lagged2 + hukou + insider + soe + educ 
               + kids + married + income + gender + ccp + parade + national + age, data=p4)
m.vcov13 <-cluster.vcov(l_watchc, p4$day)  
c <- coeftest(l_watchc, m.vcov13) # significant

# AQI and the demand for more watchdog at the central level 
c_watchc <- lm(c_watch
               ~ AQI + AQI.lagged + AQI.lagged2 + hukou + insider + soe + educ 
               + kids + married + income + gender + ccp + parade + national + age, data=p4)
m.vcov14 <-cluster.vcov(c_watchc, p4$day)  
d <- coeftest(c_watchc, m.vcov14) # significant 


# AQI and anti-west sentiment
anti_westc <- lm(anti_west
                 ~ AQI + AQI.lagged + AQI.lagged2 + hukou + insider + soe + educ 
                 + kids + married + income + gender + ccp + parade + national + age, data=p4)
m.vcov25 <-cluster.vcov(anti_westc, p4$day)  
f <- coeftest(anti_westc, m.vcov25) # significant p = .0800426


# AQI and life stisfaction
life_satc <- lm(life_sat
                ~ AQI + AQI.lagged + AQI.lagged2 + hukou + insider + soe + educ 
                + kids + married + income + gender + ccp + parade + national + age, data=p4)
m.vcov25 <-cluster.vcov(life_satc, p4$day)  
g <- coeftest(life_satc, m.vcov25) # significant

#################################################
##### Alternative/Unofficial (Chinese) AQI - CREATING A TABLE
#################################################

column.labels <- c("Local Government", "Central Government", "Local Oversight", "Central Oversight", "Global Warming")
cov.labs <-c("AQI", "AQI Lagged", "AQI Lagged 2","Hukou Status", "Regime Insider", "SOE Employee", "Education","Children","Married","Income", "Female", "CCP Member","Parade Period","Holiday","Age")

## TABLE A20
sink("~/Dropbox/Pollution and Public Perceptions in China/Manuscript/Tables/aqi.alt.tex")
stargazer(l_satc, c_satc, l_watchc, c_watchc, anti_westc, font.size = "large", digits = 4, 
          column.labels = column.labels,
          covariate.labels = cov.labs,
          dep.var.labels.include = FALSE,
          model.numbers = FALSE,
          style = "qje", no.space = FALSE, keep.stat = c("n"),
          star.cutoffs = c(.10, .05, .01), 
          se = list(a[,2], b[,2], c[, 2], d[, 2], f[, 2]),
          header = F, notes = "robust standard errors clustered by day in parentheses", float = F)
sink()
